*00000000000000000000000000000000000000000
*FUNCTIONS USED TO LOAD DATA
*00000000000000000000000000000000000000000

*Main program that directs the program flow
capture program drop SQL_data_load
program define SQL_data_load

if "`1'" == "load_cat_data" {
	load_cat_data `2' `3'
	}
else if "`1'" == "load_refund_data" {
	load_refund_data `2' `3' `4'
	}	
else if "`1'" == "load_paycheck_data" {
	load_paycheck_data `2' `3' `4'
	}	
else if "`1'" == "load_expenditures_data" {
	load_expenditures_data `2' `3' `4'
	}	
else if "`1'" == "load_liquidity_data" {
	load_liquidity_data `2' `3' `4' `5'
	}	
else if "`1'" == "combine_data" {
	combine_data `2' `3' `4' `5' `6' 
	}
else if "`1'" == "calculate_payroll_schedule" {
	calculate_payroll_schedule `2' `3' 
	}	
else if "`1'" == "process_combined_data" {
	process_combined_data `2' `3' `4' 
	}	
else if "`1'" == "append_data" {	
	append_data `2' 
	}	
else   { 
	di "`1' is an Invalid Prodcedure"
	}

end


*IMPORT PAYCHECK DATA 
{
capture program drop load_paycheck_data
program define load_paycheck_data

local rawfile `1' /* name of the file the raw sql output is saved to */
local dtafile `2' /* name of stata file the sql output is saved to */
local partition `3' /* partition number */

#delimit ;
!echo "
	select a.user_id, 
		a.date, 
		median_day_diff,  /*median days between paycheck */
		median_amt,  /*median amount of paycheck */
		transaction_amount as paycheck_amt 
			from transactions_data_test_user_all partition(`partition') as a 
			inner join sample_selection_SC partition(`partition') as sample 
				on a.user_id = sample.user_id 
			inner join paycheck_SC partition(`partition')
				on a.user_id = paycheck_SC.user_id 
		where is_credit_bin =1 
			AND tx_status!='pending' 
			AND account_type!= 'CCARD' 
			AND transfer = 0 
			AND 
			transaction_description 
			REGEXP ('direct|dir dep|dirde p|salary|treas xxx fed|fed sal|payroll|ayroll|payrll|payrl|payrol|pr payment|adp|dfas-cleveland|dfas-in')
			and transaction_description 
			NOT REGEXP ('ing direct|refund|direct deposit advance|dir dep adv')
	group by user_id, date" 
		| mysql -uroot -pcheck123 main > `rawfile' ;
#delimit cr
				
*import the sql output to and create a stata dta file
	insheet using `rawfile', clear names
		*rename vars
			ren date date_str
				gen date = date(date_str,"YMD")
					drop date_str
					format date %td
		save `dtafile', replace
end
}

*IMPORT DAILY EXPENDITURES DATA  
{
capture program drop load_expenditures_data
program define load_expenditures_data

local rawfile `1' /* name of the file the raw sql output is saved to */
local dtafile `2' /* name of stata file the processed data is saved to */
local partition `3' /* partition number */

#delimit;
!echo "
/*Grabs daily spending, sample selection and takes out cc pay */
	select a.user_id,date,  
/*All outflows minus transfers and cc payments */
	CASE
		when is_credit_bin=0   
			AND isnull(c.tx_id) 
			AND transfer=0 
			AND not (
				lower(transaction_description) like '%brokerage%' OR
				lower(transaction_description) like '%outgoing%' OR
				lower(transaction_description) like '%transfer%' OR
				lower(transaction_description) like '%sweep%' OR
				lower(transaction_description) like '%wire%'
				)
		then a.transaction_amount 
			else 0
	END as x,
/*Daily Inflows 
excluding transfers */
	CASE
		when is_credit_bin=1 
			AND account_type!='CCARD' 
			AND transfer=0 
			AND not (
				lower(transaction_description) like '%brokerage%' OR
				lower(transaction_description) like '%outgoing%' OR
				lower(transaction_description) like '%transfer%' OR
				lower(transaction_description) like '%sweep%' OR
				lower(transaction_description) like '%wire%'
				)
			then a.transaction_amount 
		else 0
	END as y
/*Transactions data*/
        FROM main.transactions_data_test_user_all partition(`partition') as a
/* This part selects on the sample selection*/
inner join sample_selection_SC as b
	on a.user_id = b.user_id 
/*merging in cc payments (taking them out in all calculations) */
left join cc_pay_SC partition(`partition') as c
	on a.tx_id = c.tx_id AND a.user_id=c.user_id
/*ignoring pending to avoid double counting*/
	where tx_status!='pending';"
		| mysql -uroot -pcheck123 main > `rawfile';
#delimit cr
	
*import the sql output to process 
	insheet using `rawfile', clear names
*convert date to stata format
	ren date date_str
	gen date = date(date_str,"YMD")
		drop date_str
		format date %td
*collapse by user_id date
	collapse (sum) x* y* ,  by(user_id date)
	
*round to the nearest dollar
	foreach var of varlist x* y* {
		replace `var'=round(`var')			
	 }
*create temp user_int for panel 
	egen user_int_tmp = group(user_id)
*filling in missing values
	tsset user_int_tmp date
		tsfill 
		bysort user_int_tmp (date): carryforward user_id , replace // , 
		*replace missing expenditures and income vars with 0 (these are daily data so missing means there was no recorded flow)
			foreach type of varlist x* y* {
				replace `type' = 0 if mi(`type')
			}
	drop user_int_tmp
	compress
	save `dtafile', replace
end
}


*IMPORT LIQUIDITY DATA 
{
capture program drop load_liquidity_data
program define load_liquidity_data

local rawfile `1' /* name of the file the raw sql output is saved to */
local daily_dtafile `2' /* name of stata file the sql output is saved to */
local partition `3' /* partition number */

#delimit ;
	  !echo "SELECT 
			a.user_id, 
			date, 
			available_balance as liquid_balance 
				from bank_balance_part_new partition(`partition') as a 
			inner join sample_selection_SC partition(`partition') as sample 
				on a.user_id = sample.user_id 
			where (account_type = 'CHECKING' OR account_type = 'SAVING');
		" | mysql -uroot -pcheck123 main  > `rawfile';
	#delimit cr
		
	*import the sql output to stata to output table
		insheet using `rawfile', clear names
	*rename vars
		ren date date_str
			gen date = date(date_str,"YMD")
				drop date_str
				format date %td
	*collapse by user_id date and round
		collapse (sum) liquid_balance ,  by(user_id date)
		replace liquid_balance =round(liquid_balance)						
	*save down
		save `daily_dtafile', replace	
end
}


*COMBINE DATA 
{
capture program drop combine_data
program define combine_data

local paycheck_dta `1' /* name of the paycheck data file */
local expenditures_dta `2' /* name of expenditures file */
local daily_liq_dta `3' /* name of expenditures file */
local cat_dta `4' /* name of expense category file  */
local merged_dta `5' /* name of merged file  */
				
*Start with expenditures data 
	use `expenditures_dta', clear
*merge in paycheck data on matching date (will lead to some blank values since we are only matching on days on which paychecks are received)
	mmerge user_id date using `paycheck_dta', t(1:1)
		drop if _merge == 2
	gen pay = _merge == 3	
	
	*drop those with no paychecks
		egen any_match = max(_merge), by(user_id)
			drop _merge
			drop if any_match == 1	
				drop any_match
	
	*fill in user level values
		foreach var in median_amt median_day_diff {
			ren `var' `var'_tmp
			egen `var' = max(`var'_tmp), by(user_id)
				drop `var'_tmp
			}
	
	*create weekly, bi-weekly, semi-monthly or monthly freq category
	*bi-weekly is 3 because sometimes they will have 3 paychecks
		gen freq = 1 if inlist(median_day_diff,28,29,30,31,32) 
			replace freq = 2 if inlist(median_day_diff,15) 
			replace freq = 3 if inlist(median_day_diff,14) 
			replace freq = 4 if inlist(median_day_diff,7) 
			

*merge in liquidity data
	*daily balance (balance data only started in late 2013)
		mmerge user_id date using `daily_liq_dta', t(1:1) ukeep(liquid_balance)
			drop if _merge == 2
				drop _merge

*merge in category types and change missing to 0 
	mmerge user_id date using `cat_dta', t(1:1)
		drop if _merge == 2
		drop _merge
	
	foreach var of varlist x_* { 
		replace `var'  = 0 if mi(`var')
	}

*save merged data
	compress
	save `merged_dta', replace
end
}

*IMPORT EXPENSE CATEGORIES FROM PYTHON 
{
capture program drop load_cat_data
program define load_cat_data

local cat_csv `1' /* name of the category csv file */
local cat_dta `2' /* name of category dta file */

insheet using `cat_csv', clear names

	*rename the vars and replace missing to 0
	foreach type of varlist transaction_amount* { 
			*extract cat name
				local cat = substr("`type'",19,.)
			ren `type' x_`cat'
			replace x_`cat' = 0 if mi(x_`cat')
		}
	
	*convert date
		ren date date_str
		gen date = date(date_str,"YMD")
				drop date_str
				format date %td

	save `cat_dta', replace

di "Input from `cat_csv' and Output to `cat_dta'"

end
}


*CALCULATE PAYROLL PATTERN VARIABLE
{
capture program drop calculate_payroll_schedule
program define calculate_payroll_schedule

local dta_file `1' /* name of the category csv file */
local payroll_file `2' /* name of category dta file */

*create the strict sample
use user_id date freq pay using `dta_file' if freq ==3 & pay == 1 , clear
	*create week
	*date before when you want to start (using thur) 
		generate date_since_start=date-td(06dec2012)
	*take floor and divide by 7 to get week
		gen week = floor(date_since_start/7)
	
	*create cutoff
	gen odd_week = mod(week,2)
	egen odd_mean = mean(odd_week), by(user_id)
	gen odd = 1 if odd_mean >= 0.9
		replace odd = 0 if odd_mean <= 0.1
	drop if mi(odd)
	
	*keep and save
		keep user_id odd
			duplicates drop
		save `payroll_file', replace
end
}

*PROCESS COMBINED DATA
{
capture program drop process_combined_data
program define process_combined_data

local merged_dta `1' /* name of merged dta file  */
local payroll_file `2' /* name of category dta file */
local processed_dta `3' /* name of processed dta file */

*DEFINE PAYROLL SCHEDULE 
	SQL_data_load "calculate_payroll_schedule" `merged_dta' `payroll_file'

	use `merged_dta', clear
	
	*Aug 9 is duplicated for some and causes balances to spike up. Replace it to missing.
				replace liquid_balance = . if date == td(9aug2013)
	*merge in bi-weekly identifier
			mmerge user_id using `payroll_file', t(n:1)
			drop if _merge == 2			
	*standardize naming and gross up for payroll tax
			ren paycheck_amt y_paycheck
			replace y_paycheck = y_paycheck/0.7			
	*collapse by week
		*date before when you want to start 
			generate date_since_start=date-td(06dec2012)
		*take floor and divide by 7 to get week
			gen week_date = floor(date_since_start/7)				
			collapse (sum) x* y y_paycheck (mean) freq liquid_balance odd  (min) date, by(user_id week_date)
				drop if mi(freq)
		*drop min and max week per person
			egen min_week = min(week_date), by(user_id)
			egen max_week = max(week_date), by(user_id)
				drop if inlist(week_date,min_week,max_week)
				drop min_week max_week
		
		compress
		save `processed_dta', replace
end
}


*COMBINE PROCESSED DATA
{
capture program drop append_data
program define append_data

local processed_dta `1' /* name of processed dta file  */

*paste the after collapse pieces together
	use `processed_dta'0.dta , clear
		forval i = 1/9{
			 append using `processed_dta'`i'.dta
				
			}
		compress
	save `processed_dta'merged.dta, replace	

end
}


*IMPORT TAX REFUND DATA 
{
capture program drop load_refund_data
program define load_refund_data

local rawfile `1' /* name of the file the raw sql output is saved to */
local dtafile `2' /* name of stata file the sql output is saved to */
local partition `3' /* partition number */

#delimit ;
!echo "
	/*Grabs tax refund date and amount */
	select a.user_id,date
		FROM transactions_data_test_user_all partition(`partition') as a use index()
	inner join sample_selection_SC partition(`partition') as sample 
				on a.user_id = sample.user_id 
	 where ((lower(transaction_description) like '%tax%' AND lower(transaction_description) like '%treas%'
		AND lower(transaction_description) like '%ref%')
			OR
		lower(transaction_description) like '%sbtpg%') AND is_credit_bin=1 and tx_status!='pending'
	group by a.user_id, date;"
		| mysql -uroot -pcheck123 main > `rawfile' ;
#delimit cr
				
*import the sql output to and create a stata dta file
	insheet using `rawfile', clear names
		*create stata date
			ren date date_str
			gen date = date(date_str,"YMD")
				drop date_str
				format date %td			
			gen year = year(date)

		*only keep if 1 tax refund
			egen count = count(date), by(user_id year)
			keep if count == 1 
			drop year
		*create week
			generate date_since_start=date-td(06dec2012)
		*take floor and divide by 7 to get week
			gen week_date = floor(date_since_start/7)
			keep user_id week_date
		save `dtafile', replace
end
}

